### Homework 2
library(devtools)
## Loading required package: usethis
library(ggbiplot)
## Loading required package: ggplot2
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
### Task 1
# 1.a
# NOTE(review): setwd() makes the script machine-specific; a project-relative
# path would be more portable. Kept here to reproduce the original run.
setwd("~/Downloads/UniStuff/IE 582/HW2")
muskdata <- read.csv(file = "MUSK1.csv", header = TRUE)
# princomp() has no 'center'/'scale' arguments: the original call produced the
# warning "extra arguments 'center', 'scale' will be disregarded", i.e. the PCA
# actually ran on the raw (unscaled) covariance matrix. The ignored arguments
# are dropped here so the call says what it does. If standardized PCA was
# intended, use princomp(..., cor = TRUE) or
# prcomp(..., center = TRUE, scale. = TRUE) instead.
pca <- princomp(muskdata[3:168])
str(pca)
## List of 7
## $ sdev : Named num [1:166] 673 379 279 248 221 ...
## ..- attr(*, "names")= chr [1:166] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ loadings: 'loadings' num [1:166, 1:166] 0.006074 0.059197 0.066313 -0.07226 -0.000118 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:166] "X42" "X.198" "X.109" "X.75" ...
## .. ..$ : chr [1:166] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ center : Named num [1:166] 38.7 -120 -79.2 16.1 -112.3 ...
## ..- attr(*, "names")= chr [1:166] "X42" "X.198" "X.109" "X.75" ...
## $ scale : Named num [1:166] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:166] "X42" "X.198" "X.109" "X.75" ...
## $ n.obs : int 475
## $ scores : num [1:475, 1:166] 133.1 52.3 201.7 146.5 163 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:166] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ call : language princomp(x = muskdata[3:168])
## - attr(*, "class")= chr "princomp"
# Scatter of the first two component scores, coloured by the class label
# (column X1); "+ 1" shifts labels 0/1 onto palette colours 1/2.
plot(pca$scores[, 1], pca$scores[, 2], col = muskdata$X1 + 1, pch = ".", cex = 7,
     xlab = "First Component", ylab = "Second Component", main = "PCA")

# Plotting the scores of the first two components (with bag labels colour
# coded) reveals three main clusters. Two of them contain both musk and
# non-musk labels, meaning the first two components -- even though they explain
# 54% of the variance -- fail to explain the label distribution for those two
# clusters. The third main cluster consists solely of confirmed musk molecules.
# The remaining data, made up of smaller clusters and some scattered points,
# appears to be separated correctly with respect to its labels (no colour mix).
ggbiplot(pca)

par(mfrow = c(1, 1))
# Line-type screeplot of the leading 25 eigenvalues, with the Kaiser
# criterion (eigenvalue = 1) marked in red.
screeplot(pca, npcs = 25, type = "l", main = "Screeplot of the first 25 PCs")
abline(h = 1, lty = 5, col = "red")

# In this screeplot the red line sits at 1: the points falling below it have
# eigenvalues smaller than 1, which happens after the 90% cumulative-variance
# threshold. That split occurs between components 16 and 21.
par(mfrow = c(1, 2))
# Eigenvalues: the variance captured by each principal component.
variances <- pca$sdev^2
variances[1:10]
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## 453210.10 143585.73 77718.16 61456.91 49013.16 46715.06 30658.10
## Comp.8 Comp.9 Comp.10
## 23520.93 18169.62 17192.83
# Share of the total variance explained by each component.
prop_var <- variances / sum(variances)
prop_var[1:20]
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## 0.407454310 0.129089415 0.069871788 0.055252262 0.044064828 0.041998743
## Comp.7 Comp.8 Comp.9 Comp.10 Comp.11 Comp.12
## 0.027562882 0.021146269 0.016335226 0.015457054 0.012735485 0.011581332
## Comp.13 Comp.14 Comp.15 Comp.16 Comp.17 Comp.18
## 0.011051607 0.008871376 0.008190991 0.007695100 0.006792809 0.006371665
## Comp.19 Comp.20
## 0.006102287 0.005670939
sum(prop_var[1:18])
## [1] 0.9015231
# Left: per-component variance share. Right: its cumulative sum.
plot(prop_var)
plot(cumsum(prop_var))

# Reducing the data to the first two components would explain 54% of the
# variance; to account for more -- say 90% -- at least 18 components are
# needed.
par(mfrow = c(1, 1))
# Cumulative share of variance; guides at 18 components / 0.9 mark the point
# where the 90% threshold is reached.
cumpro <- cumsum(pca$sdev^2) / sum(pca$sdev^2)
plot(cumpro[1:25], xlab = "PC #", ylab = "Amount of explained variance", main = "Cumulative variance plot")
abline(v = 18, lty = 5, col = "blue")
abline(h = 0.9, lty = 5, col = "blue")

# Another look at our PCA components
#### Metric MDS ####
# Classical (metric) MDS on the Euclidean distances of the 166 features;
# k = 2 is the number of output dimensions.
mds <- cmdscale(dist(muskdata[3:168]), k = 2, eig = TRUE)
x <- mds$points[, 1]
y <- mds$points[, 2]
# Empty canvas first (type = "n"), then the row names as coloured labels.
plot(x, y, xlab = "Coordinate 1", ylab = "Coordinate 2", main = "Metric MDS", type = "n")
text(x, y, labels = row.names(muskdata), cex = .7, col = muskdata$X1 + 1)

# We obtain the same plot we had when we plotted the data with respect to the
# first two components in the PCA part.
## 1.b
setwd("~/Downloads/UniStuff/IE 582/HW2")
muskdata <- read.csv(file = "MUSK1.csv", header = TRUE)
# Bag-level representation: average all instance rows of each bag (column
# 'X1.1' holds the bag id). Building the matrix with lapply() + do.call(rbind)
# avoids growing it one row at a time with rbind() in a loop (O(n^2) copying),
# and iterating over the ids actually present generalizes the original
# hard-coded 1:92 while producing the same result for this data set.
bag_ids <- sort(unique(muskdata$X1.1))
means <- do.call(rbind, lapply(bag_ids, function(id) {
  colMeans(subset(muskdata, X1.1 == id))
}))
meanframe <- data.frame(means)
avg_pca <- prcomp(means)
par(mfrow = c(1, 2))
summary(avg_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 476.423 388.1926 315.6617 267.6392 208.86387
## Proportion of Variance 0.323 0.2144 0.1418 0.1019 0.06208
## Cumulative Proportion 0.323 0.5374 0.6792 0.7812 0.84323
## PC6 PC7 PC8 PC9 PC10
## Standard deviation 141.53526 132.95915 102.15680 97.4167 79.99774
## Proportion of Variance 0.02851 0.02516 0.01485 0.0135 0.00911
## Cumulative Proportion 0.87174 0.89689 0.91174 0.9253 0.93435
## PC11 PC12 PC13 PC14 PC15
## Standard deviation 78.88618 60.64198 56.35427 53.55273 51.85703
## Proportion of Variance 0.00886 0.00523 0.00452 0.00408 0.00383
## Cumulative Proportion 0.94321 0.94844 0.95296 0.95704 0.96087
## PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 50.9702 46.06380 43.86342 41.64074 39.3357 37.81769
## Proportion of Variance 0.0037 0.00302 0.00274 0.00247 0.0022 0.00204
## Cumulative Proportion 0.9646 0.96758 0.97032 0.97279 0.9750 0.97703
## PC22 PC23 PC24 PC25 PC26
## Standard deviation 36.43081 35.30017 32.76942 31.3236 30.31878
## Proportion of Variance 0.00189 0.00177 0.00153 0.0014 0.00131
## Cumulative Proportion 0.97892 0.98069 0.98222 0.9836 0.98492
## PC27 PC28 PC29 PC30 PC31
## Standard deviation 28.83568 28.54947 27.69744 26.08912 24.06399
## Proportion of Variance 0.00118 0.00116 0.00109 0.00097 0.00082
## Cumulative Proportion 0.98610 0.98726 0.98836 0.98932 0.99015
## PC32 PC33 PC34 PC35 PC36
## Standard deviation 24.00827 22.65363 21.15223 20.99901 19.79506
## Proportion of Variance 0.00082 0.00073 0.00064 0.00063 0.00056
## Cumulative Proportion 0.99097 0.99170 0.99234 0.99296 0.99352
## PC37 PC38 PC39 PC40 PC41
## Standard deviation 19.33216 17.99780 17.64630 16.54225 16.32596
## Proportion of Variance 0.00053 0.00046 0.00044 0.00039 0.00038
## Cumulative Proportion 0.99405 0.99451 0.99496 0.99535 0.99573
## PC42 PC43 PC44 PC45 PC46
## Standard deviation 15.89539 15.77299 14.17232 13.52522 13.37045
## Proportion of Variance 0.00036 0.00035 0.00029 0.00026 0.00025
## Cumulative Proportion 0.99608 0.99644 0.99672 0.99699 0.99724
## PC47 PC48 PC49 PC50 PC51
## Standard deviation 12.89266 12.38336 11.8089 11.46886 11.27417
## Proportion of Variance 0.00024 0.00022 0.0002 0.00019 0.00018
## Cumulative Proportion 0.99748 0.99769 0.9979 0.99808 0.99826
## PC52 PC53 PC54 PC55 PC56 PC57
## Standard deviation 10.76037 10.31929 9.56433 9.46493 8.89280 8.4668
## Proportion of Variance 0.00016 0.00015 0.00013 0.00013 0.00011 0.0001
## Cumulative Proportion 0.99843 0.99858 0.99871 0.99883 0.99895 0.9990
## PC58 PC59 PC60 PC61 PC62 PC63
## Standard deviation 8.14739 7.61831 7.27496 7.09337 6.88045 6.37558
## Proportion of Variance 0.00009 0.00008 0.00008 0.00007 0.00007 0.00006
## Cumulative Proportion 0.99914 0.99923 0.99930 0.99937 0.99944 0.99950
## PC64 PC65 PC66 PC67 PC68 PC69
## Standard deviation 6.04981 5.66512 5.43414 5.32847 4.91153 4.70723
## Proportion of Variance 0.00005 0.00005 0.00004 0.00004 0.00003 0.00003
## Cumulative Proportion 0.99955 0.99960 0.99964 0.99968 0.99971 0.99974
## PC70 PC71 PC72 PC73 PC74 PC75
## Standard deviation 4.55197 4.36495 4.00033 3.84315 3.77200 3.67564
## Proportion of Variance 0.00003 0.00003 0.00002 0.00002 0.00002 0.00002
## Cumulative Proportion 0.99977 0.99980 0.99982 0.99984 0.99987 0.99988
## PC76 PC77 PC78 PC79 PC80 PC81
## Standard deviation 3.34858 3.15935 2.97065 2.76911 2.56367 2.50420
## Proportion of Variance 0.00002 0.00001 0.00001 0.00001 0.00001 0.00001
## Cumulative Proportion 0.99990 0.99991 0.99993 0.99994 0.99995 0.99996
## PC82 PC83 PC84 PC85 PC86 PC87 PC88
## Standard deviation 2.16672 2.15720 2.07444 2.02640 1.797 1.64 1.553
## Proportion of Variance 0.00001 0.00001 0.00001 0.00001 0.000 0.00 0.000
## Cumulative Proportion 0.99996 0.99997 0.99998 0.99998 1.000 1.00 1.000
## PC89 PC90 PC91 PC92
## Standard deviation 1.479 1.209 1.029 4.645e-14
## Proportion of Variance 0.000 0.000 0.000 0.000e+00
## Cumulative Proportion 1.000 1.000 1.000 1.000e+00
ggbiplot(avg_pca)

# Metric MDS on the bag-averaged feature rows; k = 2 output dimensions.
avg_mds <- cmdscale(dist(meanframe[3:168]), eig = TRUE, k = 2)
x <- avg_mds$points[, 1]
y <- avg_mds$points[, 2]
plot(x, y, xlab = "Coordinate 1", ylab = "Coordinate 2", main = "Metric MDS, Averaged frame", type = "p", col = meanframe$X1 + 1
)
# Bug fix: the labels must come from the averaged frame (one row per bag).
# The original passed row.names(muskdata) (~475 instance-level labels), which
# made text() recycle the 92 coordinates and over-plot spurious labels.
text(x, y, labels = row.names(meanframe), cex = .7, col = meanframe$X1 + 1)
par(mfrow = c(1, 1))

# Here we observe a different distribution. The three main clusters are still
# present, and two of them contain mixed bags with respect to their labels,
# just like in the previous part. The third main cluster, which previously
# consisted solely of positively labeled data, has now entered a region of
# heterogeneity.
# The cause may be the loss of information that results from aggregating
# individual instances into a group-level summary.
# Thus we can say that averaging over instances might not be the best idea,
# as it causes information loss.
### Task 2
# 2.1
# Load the imager package for image IO and manipulation (its attach messages
# from the original run are kept below for reference).
library(imager)
## Loading required package: magrittr
##
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
##
## add
## The following object is masked from 'package:grid':
##
## depth
## The following object is masked from 'package:plyr':
##
## liply
## The following objects are masked from 'package:stats':
##
## convolve, spectrum
## The following object is masked from 'package:graphics':
##
## frame
## The following object is masked from 'package:base':
##
## save.image
# Read the picture from disk and display it as-is.
img <- load.image("newyear.jpg")
plot(img)

# 2.2
# Uniform noise per colour channel: each noise value is drawn between the
# channel's minimum pixel value and 0.1 * its maximum, as instructed. The
# three copy-pasted min/max/runif stanzas of the original are factored into
# one helper; the runif() calls still happen in the same order (red, green,
# blue) with the same sizes, so the random stream is identical.
# NOTE(review): if a channel's minimum exceeded 0.1 * its maximum, runif()
# would return NaN with a warning -- confirm the channel ranges before reusing
# this on another image.
channel_noise <- function(channel) {
  lo <- min(channel)
  hi <- max(channel) * 0.1
  matrix(runif(256 * 256, lo, hi), nrow = 256)
}
noise1 <- channel_noise(img[1:256, 1:256, 1])
noise2 <- channel_noise(img[1:256, 1:256, 2])
noise3 <- channel_noise(img[1:256, 1:256, 3])
par(mfrow = c(1, 3))
# Merge the noise into the image, channel by channel.
img[,,1] <- img[,,1] + noise1
img[,,2] <- img[,,2] + noise2
img[,,3] <- img[,,3] + noise3
par(mfrow = c(1, 1))
plot(img)

# The noisy image was created by taking the minimum and maximum pixel values
# of each channel into account.
# Each noise value in a channel is distributed uniformly between the minimum
# pixel value of that channel and 0.1 times its maximum pixel value.
# Three noise matrices, one per channel, were created as instructed and merged
# into the original image.
par(mfrow = c(1, 3))
# Raw display of each channel (default colour mapping).
image(img[,,1])
image(img[,,2])
image(img[,,3])

par(mfrow = c(1, 3))
# Same channels again, in 200-step greyscale.
image(img[,,1], col = grey(seq(0, 1, length = 200)))
image(img[,,2], col = grey(seq(0, 1, length = 200)))
image(img[,,3], col = grey(seq(0, 1, length = 200)))

# The origin of a loaded image is the top-left corner, but image() draws a
# matrix with the origin at the bottom-left, so the displays above are
# rotated. They are left as-is for show; subsequent images are rotated via a
# newly defined rotate() function or by filling matrices with byrow = TRUE.
# Note that the images above display each channel separately, using only the
# channel values.
# If colour is also desired, one can set the values of the other channels
# (apart from the one being displayed) to zero and display the image.
# 2.3
# 2.3.a
gray.img <- grayscale(img)
par(mfrow = c(1, 1))
plot(gray.img)

# Extract every full (patchdim x patchdim) patch centred on an interior pixel
# of the 256 x 256 greyscale image. 'patchdim' is kept as a variable so that
# different (odd) patch sizes can be tried.
patchdim <- 25
half <- (patchdim - 1) / 2          # pixels on each side of the patch centre
n.patches <- (256 - (patchdim - 1))^2
# Preallocate as numeric (the original 'matrix(, ...)' created a logical NA
# matrix that was coerced on first assignment).
patches <- matrix(NA_real_, nrow = n.patches, ncol = patchdim^2)
k <- 1
for (i in (1 + half):(256 - half)) {
  for (j in (1 + half):(256 - half)) {
    # Each patch is flattened column-major into one row of 'patches'.
    patches[k, ] <- gray.img[(i - half):(i + half), (j - half):(j + half), , ]
    k <- k + 1
  }
}
# princomp() ignores 'center'/'scale' (the original call produced a
# "disregarded" warning), so those arguments are dropped; the PCA therefore
# runs on the covariance matrix of the raw patch vectors.
pca <- princomp(patches)
par(mfrow = c(1, 3))
plot(pca$scores[, 1], pca$scores[, 2])
comp.var <- pca$sdev^2
p.comp.var <- comp.var / sum(comp.var)
p.comp.var[1:10]
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## 0.765726669 0.081080823 0.040531882 0.024071144 0.015034930 0.009101686
## Comp.7 Comp.8 Comp.9 Comp.10
## 0.008774755 0.005233861 0.004498384 0.003984406
cumsum(p.comp.var)[1:10]
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## 0.7657267 0.8468075 0.8873394 0.9114105 0.9264454 0.9355471 0.9443219
## Comp.8 Comp.9 Comp.10
## 0.9495557 0.9540541 0.9580385
plot(p.comp.var)
plot(cumsum(p.comp.var))

# The first component alone explains more than 3/4 of the variance, and the
# first three components are enough to explain nearly 9/10 of the variance of
# the data.
# 2.3.b
# image() draws a matrix with its origin at the bottom-left, while the image
# origin is top-left; rotating the score matrix compensates for that.
rotate <- function(x) t(apply(x, 2, rev))
par(mfrow = c(1, 3))
side <- 256 - (patchdim - 1)
# Display the patch scores of the first three components as greyscale images,
# in the same order (1, 2, 3) as the original one-stanza-per-component code.
for (comp in 1:3) {
  score.m <- matrix(pca$scores[, comp], nrow = side)
  image(rotate(score.m), col = grey(seq(0, 1, length = 200)))
}

# The scores of the first component give a good silhouette, as if we saw the
# image with a myopic point of view.
# The second and third components, expectedly, give a less clear image than
# the precision of the first component.
# Trying different patch sizes: at 5x5, the first component's scores give more
# than a silhouette. Smaller patch sizes would expectedly give a clearer
# display, since they converge to the original image.
# 2.3.c
# Eigenvectors (principal directions) are extracted from the PCA via
# $loadings; reshaping each one back into a patchdim x patchdim matrix shows
# which part of a patch it emphasises.
par(mfrow = c(1, 3))
for (comp in 1:3) {
  loading.m <- matrix(pca$loadings[, comp], nrow = patchdim, byrow = TRUE)
  image(loading.m, col = grey(seq(0, 1, length = 200)))
}

# In these plots, bolder regions correspond to larger eigenvector values than
# the clear regions.
# Each component displays the part of the patch corresponding to its bolder
# regions more accurately than the other parts.
# For example, the first component mostly explains the information at the
# edges of an image, i.e. of a patch.